################################################################################################
# This script takes the clean QV data and calculates the parameters of the model - using MLE - assuming a two-step process starting with choosing a vote class and then casting the vote(s) on the most preferred issue(s). The thresholds determining the choice of vote class along with the probabilities of making a mistake at either step of the process are estimated from the MTurk sample. Denoting by v_k the voter’s kth highest value assigned to a proposal, we conjecture that behavior can be summarized by five parameters {rho, gamma, csi*, epsilon, mu}. Now, assume we have a candidate vector {rho, gamma, csi, epsilon, mu}. 
# 
# *Note: In prior drafts, version of code, and emails we have used xi,csi, and eta interchangeably despite referring to the same parameter.  
# 
# First, in order to determine the optimal choice of vote class, we let the optimal (i.e., correct) choice be determined by the thresholds:
# - If v(4)/v(3) > rho, then R;
# - If v(4)/v(3) = rho, then R or Y;
# - If v(4)/v(3) < rho, but v(3)/v(2) > gamma, then Y;
# - If v(4)/v(3) < rho, but v(3)/v(2) = gamma, then Y or G;
# - If v(4)/v(3) < rho, v(3)/v(2) < gamma, but v(2)/v(1) > csi, then G;
# - If v(4)/v(3) < rho, v(3)/v(2) < gamma, but v(2)/v(1) = csi, then G or B;
# - If v(4)/v(3) < rho, v(3)/v(2) < gamma, and v(2)/v(1) < csi, then B;
# If the individual made the right choice then he/she is assigned a likelihood value of epsilon and otherwise, if the agent made a mistake when choosing the vote class, then the likelihood assigned is epsilon/3.
# 
# Second, given the observed vote class, with probability 1-mu all votes are cast monotonically, i.e., on the highest intensity initiatives, and otherwise the votes are cast non-monotonically. In order to estimate this, we tabulated all the ways in which the vote(s) could be cast and determined how many of these would be considered weakly monotone and how many would not. Thus, if the subject failed to cast the vote(s) monotonically he/she was assigned a probability of mu/#nonMonotone and otherwise a probability of (1-mu)/#Monotone. 
#   
#   Given the likelihood function for each agent, we took the sum of the negative log likelihoods and used this as our loss function to be minimized. In order to find the minima we used the "dfoptim" R-package which uses Nelder-Mead optimization algorithms (combined with arctan-bounds to enforce box-constraints) to minimize this function. Since the loss function itself had multiple local minima, we used a simulated annealing algorithm in order to find the best starting vector for the original data. The same starting point is used for the bootstrapped data. 
# 
# 
# Author: Luis S.
# Last Modified: 1.9.19
################################################################################################

library(dplyr)
library(gtools)
# library(nloptr) # has method of moving asymptotes (mma fn with constraints)
library(dfoptim) # uses arctan with derivative free algorithms, e.g. Nelder mead, to get box constraints

setwd("C:/Users/Luis/Dropbox/Research/SV vs QV California Experiment/Experiment Analysis & Results")

# Creates dataframe with original parameters and the bootstrapped data
set.seed(3)



##########################
#### QV Loss Function ####
##########################


calcLossScore_classAndVote <- function(sampleData, rho, gamma, eta, eps, mu)
{
  # sampleData <- QV_data
  # rho <- 1.33; gamma <- 1.18; eta <- 1.38; eps <- .49; mu = .21
  
  # Mutate data frame with calculation
  sampleData <- sampleData %>% mutate(PredClass = NA, QV1_LossScore = NA, QV2_LossScore = NA, 
                                      QV1_ClassPenalty = NA, QV2_ClassPenalty = NA, 
                                      QV1_MonoPenalty = NA, QV2_MonoPenalty = NA) 
  
  sampleData <- sampleData %>% 
    mutate(PredClass = ifelse(FirstBest_Pref > rho*SecondBest_Pref, "(1) Red",
                              ifelse(FirstBest_Pref == rho*SecondBest_Pref, "(1) Red or (2) Yellow",
                                     ifelse(SecondBest_Pref > gamma*ThirdBest_Pref, "(2) Yellow",
                                            ifelse(SecondBest_Pref == gamma*ThirdBest_Pref, "(2) Yellow or (3) Green",
                                                   ifelse(ThirdBest_Pref > eta*FourthBest_Pref, "(3) Green",
                                                          ifelse(ThirdBest_Pref == eta*FourthBest_Pref, "(3) Green or (4) Blue", "(4) Blue")))))))
  
  for(row in 1:nrow(sampleData))
  {
    sampleData$QV1_ClassPenalty[row] <- ifelse(grepl(sampleData$QV1_Class[row],sampleData$PredClass[row], fixed = T), 1 - eps, eps/3)
    sampleData$QV2_ClassPenalty[row] <- ifelse(grepl(sampleData$QV2_Class[row],sampleData$PredClass[row], fixed = T), 1 - eps, eps/3)
  }

  
  sampleData <- sampleData %>% 
    mutate(QV1_MonoPenalty = ifelse(IsMonotone_QV1 == T, 1-mu,
                                    ifelse(QV1_Class == "(1) Red", mu/NumPossErrors_Red, 
                                           ifelse(QV1_Class == "(2) Yellow", mu/NumPossErrors_Yellow, mu/NumPossErrors_Green))), 
           QV2_MonoPenalty = ifelse(IsMonotone_QV2 == T, 1-mu,
                                    ifelse(QV2_Class == "(1) Red", mu/NumPossErrors_Red, 
                                           ifelse(QV2_Class == "(2) Yellow", mu/NumPossErrors_Yellow, mu/NumPossErrors_Green)))) %>%
    mutate(QV1_LossScore = QV1_ClassPenalty*QV1_MonoPenalty, QV2_LossScore = QV2_ClassPenalty*QV2_MonoPenalty)
  
  # log the loss scores
  
  log_QV1_LossScore <- log(sampleData$QV1_LossScore)
  log_QV2_LossScore <- log(sampleData$QV2_LossScore)
  
  return(c(-sum(log_QV1_LossScore),-sum(log_QV2_LossScore)))
}


###################################
#### QV Dataframe Manipulation ####
###################################


refsNotAbstained <- function(subjectData)
{
  refs <- c()
  
  if(subjectData$Ref1_BilingualEduc != "Abstain"){ refs <- c(refs, "BilingualEduc")}
  if(subjectData$Ref2_TeacherTenure != "Abstain"){ refs <- c(refs, "TeacherTenure")}
  if(subjectData$Ref3_PublicBonds != "Abstain"){ refs <- c(refs, "PublicBonds")}
  if(subjectData$Ref4_Immigration != "Abstain"){ refs <- c(refs, "Immigration")}
  
  return(refs) 
}

IsVoteMonotone <- function(candidateRefs, Prefs, Refs)
{
  
  indexes <- c()
  
  # find the pref intensities that correspond to candidate refs 
  for(i in 1:length(candidateRefs)) { indexes <- c(indexes,which(Refs == candidateRefs[i])) }
  candidatePrefs <- Prefs[indexes]
  
  # sort candidate prefs and refs in decreasing order
  candidateRefs <- candidateRefs[order(candidatePrefs, decreasing = T)]
  candidatePrefs <- candidatePrefs[order(candidatePrefs, decreasing = T)]
  
  # iterate through the candidates and check that there is no issue with more points that wasnt chosen
  maxChecks <- length(candidateRefs)
  isMonotone <- T
  
  for(iter in 1:maxChecks)
  {
    currentPref <- candidatePrefs[iter] 
    currentRef <- candidateRefs[iter]
    
    if(max(Prefs) > currentPref) { isMonotone <- F } else {
      # remove current ones from consideration
      
      alreadyCheckedIndex <- which(Refs == currentRef)
      Prefs <- Prefs[-alreadyCheckedIndex]
      Refs <- Refs[-alreadyCheckedIndex]
    }
    
    if(!isMonotone) { break()} # once you find a monotonicity mistake, isMonotone == F, exit loop
    
  }
  
  return(isMonotone)
}


#--- Sort Issues and Prefs by Most Preferred to Least Preferred AND Determine if it meets monotonicity under Red, Yellow, Green Votes (Blue is met by design)

QV_data <- read.csv("California Stage 2 - SV & QV Data/Clean Data/California_MTurk_QV_Stage_2_clean&Recoded_AllPos.csv", header = T, stringsAsFactors = F)

QV_data[is.na(QV_data)] <- 0

QV_data <- QV_data %>% select(-matches("Abstain")) %>%
  mutate(FirstBest_Ref = NA, FirstBest_Pref = NA, SecondBest_Ref = NA, SecondBest_Pref = NA, 
         ThirdBest_Ref = NA, ThirdBest_Pref = NA, FourthBest_Ref = NA, FourthBest_Pref = NA,
         NumAbstains = NA, HasTies = NA, IsMonotone_QV1 = NA, IsMonotone_QV2 = NA,
         NumPossCorrect_Red = NA, NumPossCorrect_Yellow = NA, NumPossCorrect_Green = NA,
         NumPossErrors_Red = NA, NumPossErrors_Yellow = NA, NumPossErrors_Green = NA)

for(row in 1:nrow(QV_data))
{
  
  #--- Determine number of ways in which subject could make monotonicity mistake under feasible classes

  # find number of abstains in case we want to restrict epsilon to only those that dont force a vote on abstain
  voteDirection <- c(QV_data$Ref1_BilingualEduc[row], QV_data$Ref2_TeacherTenure[row], QV_data$Ref3_PublicBonds[row], QV_data$Ref4_Immigration[row]) #must match ref list order
  RefList <- c("BilingualEduc", "TeacherTenure", "PublicBonds", "Immigration") # must match vote direction order
  PrefStrList <- c(QV_data$EducationPref[row], QV_data$TeachersPref[row], QV_data$BondsPref[row], QV_data$ImmigrationPref[row]) # must match order of two above
  
  QV_data$NumAbstains[row] <- length(which(voteDirection == "Abstain")) # count number of abstains
  
  # Determine whether subject has ties
  if(length(unique(PrefStrList)) < 4) 
  {
    QV_data$HasTies[row] <- T
    
    # find all posible ways to cast the votes for a given rule
    feasibleRed <- combinations(r = 1, n = 4, v = RefList)
    feasibleYellow <- combinations(r = 2, n = 4, v = RefList)
    feasibleGreen <- combinations(r = 3, n = 4, v = RefList)
    
    # create vectors to store results
    resultsRed <- c()
    resultsYellow <- c()
    resultsGreen <- c()
    
    # iterate and check
    for(candidate in 1:nrow(feasibleRed)) { resultsRed <- c(resultsRed,IsVoteMonotone(feasibleRed[candidate,], PrefStrList, RefList)) }
    for(candidate in 1:nrow(feasibleYellow)) { resultsYellow <- c(resultsYellow,IsVoteMonotone(feasibleYellow[candidate,], PrefStrList, RefList)) }
    for(candidate in 1:nrow(feasibleGreen)) { resultsGreen <- c(resultsGreen,IsVoteMonotone(feasibleGreen[candidate,], PrefStrList, RefList)) }

    # save results
    QV_data$NumPossCorrect_Red[row] <- length(resultsRed[resultsRed == T]) 
    QV_data$NumPossErrors_Red[row] <- length(resultsRed[resultsRed == F]) 
    
    QV_data$NumPossCorrect_Yellow[row] <- length(resultsYellow[resultsYellow == T]) 
    QV_data$NumPossErrors_Yellow[row] <- length(resultsYellow[resultsYellow == F])
    
    QV_data$NumPossCorrect_Green[row] <- length(resultsGreen[resultsGreen == T]) 
    QV_data$NumPossErrors_Green[row] <- length(resultsGreen[resultsGreen == F])
    
    
  } else {
    
    QV_data$HasTies[row] <- F
    
    QV_data$NumPossCorrect_Red[row] <- 1
    QV_data$NumPossErrors_Red[row] <- 3
    
    QV_data$NumPossCorrect_Yellow[row] <- 1
    QV_data$NumPossErrors_Yellow[row] <- 5
    
    QV_data$NumPossCorrect_Green[row] <- 1
    QV_data$NumPossErrors_Green[row] <- 3
  }
  
  #--- Determine if agent is monotone wrt to chosen class 
  observedVote_QV1 <- data.frame(Issues = c("BilingualEduc", "TeacherTenure", "PublicBonds", "Immigration"),
                                 Votes = c(QV_data$QV1_Vote_BilingualEduc[row], QV_data$QV1_Vote_TeacherTenure[row], 
                                           QV_data$QV1_Vote_PublicBonds[row], QV_data$QV1_Vote_Immigration[row]), 
                                 stringsAsFactors = F) %>% filter(Votes == 1)
  
  observedVote_QV2 <- data.frame(Issues = c("BilingualEduc", "TeacherTenure", "PublicBonds", "Immigration"),
                                 Votes = c(QV_data$QV2_Vote_BilingualEduc[row], QV_data$QV2_Vote_TeacherTenure[row], 
                                           QV_data$QV2_Vote_PublicBonds[row], QV_data$QV2_Vote_Immigration[row]), 
                                 stringsAsFactors = F) %>% filter(Votes == 1)
  
  
  QV_data$IsMonotone_QV1[row] <- IsVoteMonotone(observedVote_QV1$Issues, PrefStrList, RefList)
  
  QV_data$IsMonotone_QV2[row] <- IsVoteMonotone(observedVote_QV2$Issues, PrefStrList, RefList)
  
  
  
  #--- Determine first, second, third, and fourth best ---
  
  # Determine first best, store in dataframe, and remove from consideration
  nextRefIndex <- which(PrefStrList == max(PrefStrList))
  nextRefIndex <- ifelse(length(nextRefIndex) == 1, nextRefIndex, sample(nextRefIndex, 1, prob = rep(1/length(nextRefIndex),length(nextRefIndex))))
  
  QV_data$FirstBest_Ref[row] <- RefList[nextRefIndex]
  QV_data$FirstBest_Pref[row] <- PrefStrList[nextRefIndex]
  
  removeNextRefIndex <- !RefList %in% RefList[nextRefIndex]
  RefList <- RefList[removeNextRefIndex]
  PrefStrList <- PrefStrList[removeNextRefIndex]
  
  # Determine second best, store in dataframe, and remove from consideration
  nextRefIndex <- which(PrefStrList == max(PrefStrList))
  nextRefIndex <- ifelse(length(nextRefIndex) == 1, nextRefIndex, sample(nextRefIndex, 1, prob = rep(1/length(nextRefIndex),length(nextRefIndex))))
  
  QV_data$SecondBest_Ref[row] <- RefList[nextRefIndex]
  QV_data$SecondBest_Pref[row] <- PrefStrList[nextRefIndex]
  
  removeNextRefIndex <- !RefList %in% RefList[nextRefIndex]
  RefList <- RefList[removeNextRefIndex]
  PrefStrList <- PrefStrList[removeNextRefIndex]
  
  # Determine third best, store in dataframe, and remove from consideration
  nextRefIndex <- which(PrefStrList == max(PrefStrList))
  nextRefIndex <- ifelse(length(nextRefIndex) == 1, nextRefIndex, sample(nextRefIndex, 1, prob = rep(1/length(nextRefIndex),length(nextRefIndex))))
  
  QV_data$ThirdBest_Ref[row] <- RefList[nextRefIndex]
  QV_data$ThirdBest_Pref[row] <- PrefStrList[nextRefIndex]
  
  removeNextRefIndex <- !RefList %in% RefList[nextRefIndex]
  RefList <- RefList[removeNextRefIndex]
  PrefStrList <- PrefStrList[removeNextRefIndex]
  
  # Store Fourth Best
  QV_data$FourthBest_Ref[row] <- RefList
  QV_data$FourthBest_Pref[row] <- PrefStrList
  
}



####################################
#### QV Model - MLE Param Estim ####
####################################


#--- Model - MLE Estimation 


statModel_classAndVote <- function(par, sampleData, QV)
{
  # rho <- par[1] # min ratio of firstbest and secondbest for Red (1) class
  # gamma <- par[2] # min ratio of secondbest and thirdbest for Yellow (2) class
  # eta <- par[3] # min ratio of thirdbest and fourthbest for Green (3) class
  # eps <- par[4] # prob of choosing mistaking voting clas
  # mu <- par[5] # prob of mistakenly casting votes given class
  
  lossScore <- calcLossScore_classAndVote(sampleData,rho = par[1], gamma = par[2], eta = par[3], eps = par[4], mu = par[5])
  
  return(lossScore[QV])
}

# #--- Find Global Min
# library(GenSA) # simulated annealing optimization
# startingParam_classAndVote <- c(1.25,1.18,1.44,.49,.26)
# globalMinParams <- GenSA(par = startingParam_classAndVote, fn = statModel_classAndVote, sampleData = QV_data, QV = 2,
#                          lower = c(1,1,1,.05,.05), upper = c(2.5,2.5,2,.95,.95), control = list(smooth = F, max.call = 2000, seed = 855))


startingParam_classAndVote_QV1 <- c(1.33,1.18,1.39,.5,.21)
SampleParamMLE_classAndVote_QV1 <- nmkb(par = startingParam_classAndVote_QV1, fn = statModel_classAndVote, sampleData = QV_data, QV = 1,
                                        lower = c(1,1,1,.05,.05), upper = c(2.5,2.5,2,.95,.95))

startingParam_classAndVote_QV2 <- c(1.25,1.18,1.45,.49,.26)
SampleParamMLE_classAndVote_QV2 <- nmkb(par = startingParam_classAndVote_QV2, fn = statModel_classAndVote, sampleData = QV_data, QV = 2,
                                        lower = c(1,1,1,.05,.05), upper = c(2.5,2.5,2,.95,.95))


# Best QV1: 1.3278131 1.1794642 1.3881409 0.4984003 0.2108411
# Best QV2: 1.2510974 1.1809943 1.4491084 0.4919933 0.2619771

#####################################
#### QV Bootstrapped Param Calc. ####
#####################################

set.seed(7)
sampleSize <- nrow(QV_data)

# Creates dataframe with original parameters and the bootstrapped data
bootstrapIterations <- 10000

QV_ParamEstimates <- data.frame(Round = seq(0,bootstrapIterations,1), 
                                QV1_Rho = NA, QV1_Gamma = NA, QV1_Eta = NA, QV1_Eps = NA, QV1_Mu = NA, 
                                QV2_Rho = NA, QV2_Gamma = NA, QV2_Eta = NA, QV2_Eps = NA, QV2_Mu = NA)


# This loop bootstraps a new sample for QV of the same size using sampling w/ replacement
for(bsCount in 1:nrow(QV_ParamEstimates)) 
{
  print(bsCount-1)
  
  # Select a random sample of rows from the original dataset and create the bootstapped dataset
  randRowNum <- sample(1:sampleSize, sampleSize, replace = T)
  BS_data <- QV_data[randRowNum,]
  
  # Number the rows to avoid conflicts
  row.names(BS_data) <- seq(1,nrow(BS_data),1)
  
  # for the first iteration, calculate the parameters using the original data
  if(bsCount == 1){BS_data <- QV_data}
  
  # get estimates
  estimate_QV1 <- nmkb(par = startingParam_classAndVote_QV1, fn = statModel_classAndVote, sampleData = BS_data, QV = 1,
                       lower = c(1,1,1,.05,.05), upper = c(2.5,2.5,2,.95,.95))
  estimate_QV2 <- nmkb(par = startingParam_classAndVote_QV2, fn = statModel_classAndVote, sampleData = BS_data, QV = 2,
                       lower = c(1,1,1,.05,.05), upper = c(2.5,2.5,2,.95,.95))
  
  # save estimates
  QV_ParamEstimates$QV1_Rho[bsCount] <- estimate_QV1$par[1]
  QV_ParamEstimates$QV1_Gamma[bsCount] <- estimate_QV1$par[2]
  QV_ParamEstimates$QV1_Eta[bsCount] <- estimate_QV1$par[3]
  QV_ParamEstimates$QV1_Eps[bsCount] <- estimate_QV1$par[4]
  QV_ParamEstimates$QV1_Mu[bsCount] <- estimate_QV1$par[5]
  
  QV_ParamEstimates$QV2_Rho[bsCount] <- estimate_QV2$par[1]
  QV_ParamEstimates$QV2_Gamma[bsCount] <- estimate_QV2$par[2]
  QV_ParamEstimates$QV2_Eta[bsCount] <- estimate_QV2$par[3]
  QV_ParamEstimates$QV2_Eps[bsCount] <- estimate_QV2$par[4]
  QV_ParamEstimates$QV2_Mu[bsCount] <- estimate_QV2$par[5]
  
}

write.csv(QV_ParamEstimates, "Misc - SV & QV Descriptive Model Parameter Estimation/QV_ParamEstimateMLE_classAndVote.csv", row.names = F)


#--- Save Confidence Intervals

# QV_ParamEstimates <- read.csv("Misc - SV & QV Descriptive Model Parameter Estimation/QV_ParamEstimateMLE_classAndVote.csv", header = T, stringsAsFactors = F)

ParamsWithCIs <- QV_ParamEstimates %>% filter(Round == 0)

getCI <- function(dataVector, conf_level)
{
  # dataVector <- c(1,5,2,4,5)
  # conf_level <- .8
  
  totalObs <- length(dataVector)
  
  dataVector <- sort(dataVector)
  
  index <- round(conf_level*totalObs)
  
  return(dataVector[index])
}

Simulated_lowerCI <- sapply(QV_ParamEstimates %>% filter(Round > 0), getCI, conf_level = .025)
Simulated_upperCI <- sapply(QV_ParamEstimates %>% filter(Round > 0), getCI, conf_level = .975)

ParamsWithCIs <- as.data.frame(t(ParamsWithCIs)) %>% tibble::rownames_to_column(var = "Variable") %>% rename(Estimate = V1)
ParamsWithCIs$LowerCI <- Simulated_lowerCI  
ParamsWithCIs$UpperCI <- Simulated_upperCI

ParamsWithCIs <- ParamsWithCIs[-1,]

write.csv(ParamsWithCIs, "Misc - SV & QV Descriptive Model Parameter Estimation/QV_ParamEstimateWithCIs.csv", row.names = F)

